{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyalink.alink import *\n",
    "useLocalEnv(1)\n",
    "\n",
    "from utils import *\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option('display.max_colwidth', 1000)\n",
    "\n",
    "DATA_DIR = ROOT_DIR + \"banknote\" + os.sep\n",
    "\n",
    "ORIGIN_FILE = \"data_banknote_authentication.txt\"\n",
    "\n",
    "SCHEMA_STRING = \"variance double, skewness double, kurtosis double, entropy double, class int\"\n",
    "\n",
    "TRAIN_FILE = \"train.ak\"\n",
    "TEST_FILE = \"test.ak\"\n",
    "LR_PRED_FILE = \"lr_pred.ak\"\n",
    "SVM_PRED_FILE = \"svm_pred.ak\"\n",
    "\n",
    "FEATURE_COL_NAMES = [\"variance\", \"skewness\", \"kurtosis\", \"entropy\"]\n",
    "LABEL_COL_NAME = \"class\"\n",
    "\n",
    "VEC_COL_NAME = \"vec\"\n",
    "\n",
    "PREDICTION_COL_NAME = \"pred\"\n",
    "PRED_DETAIL_COL_NAME = \"predinfo\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1\n",
    "# Read the banknote data file as CSV, typed by the schema from the config cell.\n",
    "source = (\n",
    "    CsvSourceBatchOp()\n",
    "    .setFilePath(DATA_DIR + ORIGIN_FILE)\n",
    "    .setSchemaStr(SCHEMA_STRING)\n",
    ")\n",
    "\n",
    "# Report the column names and the column types of the loaded source.\n",
    "print(\"column names of source:\")\n",
    "print(source.getColNames())\n",
    "print(\"column types of source:\")\n",
    "print(source.getColTypes())\n",
    "\n",
    "# Show the first five rows.\n",
    "source.firstN(5).print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_1\n",
    "source = (\n",
    "    CsvSourceBatchOp()\n",
    "    .setFilePath(DATA_DIR + ORIGIN_FILE)\n",
    "    .setSchemaStr(SCHEMA_STRING)\n",
    ")\n",
    "\n",
    "\n",
    "def print_summary_info(tableSummary):\n",
    "    \"\"\"Print the data set count, the max of the entropy column, then the whole summary.\"\"\"\n",
    "    print(\"Count of data set : \" + str(tableSummary.count()))\n",
    "    print(\"Max value of entropy : \" + str(tableSummary.max(\"entropy\")))\n",
    "    print(tableSummary)\n",
    "\n",
    "\n",
    "# Eager form: collect the summary now and report on it.\n",
    "summary = SummarizerBatchOp().linkFrom(source).collectSummary()\n",
    "print_summary_info(summary)\n",
    "\n",
    "# Lazy form: register the same report as a callback on the summarizer.\n",
    "source.link(SummarizerBatchOp().lazyCollectSummary(print_summary_info))\n",
    "\n",
    "# Lazy form using the summarizer's own printer.\n",
    "source.link(SummarizerBatchOp().lazyPrintSummary())\n",
    "\n",
    "# Register statistics printing for the full data and for its first five rows,\n",
    "# then print those five rows.\n",
    "(\n",
    "    source\n",
    "    .lazyPrintStatistics(\"<- origin data ->\")\n",
    "    .firstN(5)\n",
    "    .lazyPrintStatistics(\"<- first 5 data ->\")\n",
    "    .print()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_1_2\n",
    "source = (\n",
    "    CsvSourceBatchOp()\n",
    "    .setFilePath(DATA_DIR + ORIGIN_FILE)\n",
    "    .setSchemaStr(SCHEMA_STRING)\n",
    ")\n",
    "\n",
    "# Eager form: collect the correlation result and report on it right away.\n",
    "correlation = CorrelationBatchOp().linkFrom(source).collectCorrelation()\n",
    "colNames = correlation.getColNames()\n",
    "print(\n",
    "    \"Correlation of \" + colNames[0] + \" with \" + colNames[1]\n",
    "    + \" is \" + str(correlation.getCorrelation()[0][1])\n",
    ")\n",
    "print(correlation.getCorrelationMatrix())\n",
    "\n",
    "\n",
    "def print_correlation_info(correlationResult: CorrelationResult):\n",
    "    \"\"\"Print the correlation of the first two columns, then the correlation matrix.\n",
    "\n",
    "    Passed below as the callback of ``lazyCollectCorrelation``.\n",
    "    \"\"\"\n",
    "    names = correlationResult.getColNames()\n",
    "    first_pair = correlationResult.getCorrelation()[0][1]\n",
    "    print(\"Correlation of \" + names[0] + \" with \" + names[1] + \" is \" + str(first_pair))\n",
    "    print(correlationResult.getCorrelationMatrix())\n",
    "\n",
    "    \n",
    "# Lazy form: hand the reporting function to the correlation operator as a callback.\n",
    "source.link(CorrelationBatchOp().lazyCollectCorrelation(print_correlation_info))\n",
    "\n",
    "# Lazy printing with the operator's default method setting.\n",
    "source.link(CorrelationBatchOp().lazyPrintCorrelation(\"< Pearson Correlation >\"))\n",
    "\n",
    "# Lazy printing with the method switched to SPEARMAN.\n",
    "source.link(\n",
    "    CorrelationBatchOp()\n",
    "    .setMethod(\"SPEARMAN\")\n",
    "    .lazyPrintCorrelation(\"< Spearman Correlation >\")\n",
    ")\n",
    "\n",
    "# Run the job so the lazy operations registered above produce their output.\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "df_banknote = source.collectToDataframe()\n",
    "\n",
    "# Pick the plotted columns by name from the config constants instead of by position\n",
    "# and a hard-coded label, so the plot stays in step with the declared schema.\n",
    "sns.pairplot(df_banknote, vars=FEATURE_COL_NAMES, hue=LABEL_COL_NAME)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.manifold import TSNE\n",
    "\n",
    "# t-SNE is stochastic: pin the seed so the embedding (and the plot) is reproducible.\n",
    "TSNE_RANDOM_STATE = 0\n",
    "\n",
    "tsne = TSNE(\n",
    "    n_components=2,\n",
    "    learning_rate=100,\n",
    "    random_state=TSNE_RANDOM_STATE,\n",
    ").fit_transform(df_banknote[FEATURE_COL_NAMES])\n",
    "\n",
    "# Colour each embedded point by its label column.\n",
    "plt.scatter(tsne[:, 0], tsne[:, 1], c=df_banknote[LABEL_COL_NAME])\n",
    "plt.colorbar()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_2\n",
    "source = (\n",
    "    CsvSourceBatchOp()\n",
    "    .setFilePath(DATA_DIR + ORIGIN_FILE)\n",
    "    .setSchemaStr(SCHEMA_STRING)\n",
    ")\n",
    "\n",
    "# Helper brought in by a star import (presumably utils). It is given the source,\n",
    "# the train and test target paths, and the ratio 0.8.\n",
    "splitTrainTestIfNotExist(\n",
    "    source,\n",
    "    DATA_DIR + TRAIN_FILE,\n",
    "    DATA_DIR + TEST_FILE,\n",
    "    0.8,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_3\n",
    "# Train/test files written by the split cell.\n",
    "train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)\n",
    "test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)\n",
    "\n",
    "lrTrainer = (\n",
    "    LogisticRegressionTrainBatchOp()\n",
    "    .setFeatureCols(FEATURE_COL_NAMES)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    ")\n",
    "lrPredictor = (\n",
    "    LogisticRegressionPredictBatchOp()\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    ")\n",
    "\n",
    "# Wire the graph: training data -> trainer, then (trainer, test data) -> predictor.\n",
    "train_data.link(lrTrainer)\n",
    "lrPredictor.linkFrom(lrTrainer, test_data)\n",
    "\n",
    "# Lazily report training and model information.\n",
    "lrTrainer.lazyPrintTrainInfo().lazyPrintModelInfo()\n",
    "\n",
    "# Lazily show five predictions and sink the predictions to the LR prediction file.\n",
    "lrPredictor.lazyPrint(5, \"< Prediction >\").link(\n",
    "    AkSinkBatchOp()\n",
    "    .setFilePath(DATA_DIR + LR_PRED_FILE)\n",
    "    .setOverwriteSink(True)\n",
    ")\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_4\n",
    "# Train/test files written by the split cell.\n",
    "train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)\n",
    "test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)\n",
    "\n",
    "svmTrainer = (\n",
    "    LinearSvmTrainBatchOp()\n",
    "    .setFeatureCols(FEATURE_COL_NAMES)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    ")\n",
    "svmPredictor = (\n",
    "    LinearSvmPredictBatchOp()\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    ")\n",
    "\n",
    "# Wire the graph: training data -> trainer, then (trainer, test data) -> predictor.\n",
    "train_data.link(svmTrainer)\n",
    "svmPredictor.linkFrom(svmTrainer, test_data)\n",
    "\n",
    "# Lazily report training and model information.\n",
    "svmTrainer.lazyPrintTrainInfo().lazyPrintModelInfo()\n",
    "\n",
    "# Lazily show five predictions and sink the predictions to the SVM prediction file.\n",
    "svmPredictor.lazyPrint(5, \"< Prediction >\").link(\n",
    "    AkSinkBatchOp()\n",
    "    .setFilePath(DATA_DIR + SVM_PRED_FILE)\n",
    "    .setOverwriteSink(True)\n",
    ")\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_5\n",
    "# Eager evaluation of the stored LR predictions.\n",
    "lr_metrics = (\n",
    "    EvalBinaryClassBatchOp()\n",
    "    .setPositiveLabelValueString(\"1\")\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .linkFrom(AkSourceBatchOp().setFilePath(DATA_DIR + LR_PRED_FILE))\n",
    "    .collectMetrics()\n",
    ")\n",
    "\n",
    "print(\"< LR >\")\n",
    "print(\n",
    "    \"AUC : \" + str(lr_metrics.getAuc())\n",
    "    + \"\\t Accuracy : \" + str(lr_metrics.getAccuracy())\n",
    "    + \"\\t Precision : \" + str(lr_metrics.getPrecision())\n",
    "    + \"\\t Recall : \" + str(lr_metrics.getRecall())\n",
    ")\n",
    "print(lr_metrics)\n",
    "\n",
    "# Save the LR charts into the data directory.\n",
    "lr_metrics.saveRocCurveAsImage(DATA_DIR + \"lr_roc.jpg\", True)\n",
    "lr_metrics.saveRecallPrecisionCurveAsImage(DATA_DIR + \"lr_recallprec.jpg\", True)\n",
    "lr_metrics.saveLiftChartAsImage(DATA_DIR + \"lr_lift.jpg\", True)\n",
    "lr_metrics.saveKSAsImage(DATA_DIR + \"lr_ks.jpg\", True)\n",
    "\n",
    "\n",
    "def save_svm_charts(binaryClassMetrics):\n",
    "    \"\"\"Save the ROC, recall-precision, lift and KS charts of the given metrics as images.\"\"\"\n",
    "    binaryClassMetrics.saveRocCurveAsImage(DATA_DIR + \"svm_roc.jpg\", True)\n",
    "    binaryClassMetrics.saveRecallPrecisionCurveAsImage(DATA_DIR + \"svm_recallprec.jpg\", True)\n",
    "    binaryClassMetrics.saveLiftChartAsImage(DATA_DIR + \"svm_lift.jpg\", True)\n",
    "    binaryClassMetrics.saveKSAsImage(DATA_DIR + \"svm_ks.jpg\", True)\n",
    "\n",
    "\n",
    "# Lazy evaluation of the stored SVM predictions: print the metrics and save the charts.\n",
    "AkSourceBatchOp().setFilePath(DATA_DIR + SVM_PRED_FILE).link(\n",
    "    EvalBinaryClassBatchOp()\n",
    "    .setPositiveLabelValueString(\"1\")\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .lazyPrintMetrics()\n",
    "    .lazyCollectMetrics(save_svm_charts)\n",
    ")\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_6\n",
    "train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)\n",
    "test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)\n",
    "\n",
    "# Assemble the feature columns into one vector column, then expand it with degree 2.\n",
    "featureExpand = (\n",
    "    Pipeline()\n",
    "    .add(\n",
    "        VectorAssembler()\n",
    "        .setSelectedCols(FEATURE_COL_NAMES)\n",
    "        .setOutputCol(VEC_COL_NAME + \"_0\")\n",
    "    )\n",
    "    .add(\n",
    "        VectorPolynomialExpand()\n",
    "        .setSelectedCol(VEC_COL_NAME + \"_0\")\n",
    "        .setOutputCol(VEC_COL_NAME)\n",
    "        .setDegree(2)\n",
    "    )\n",
    "    .fit(train_data)\n",
    ")\n",
    "\n",
    "# Both sets go through the pipeline fitted on the training data.\n",
    "train_data = featureExpand.transform(train_data)\n",
    "test_data = featureExpand.transform(test_data)\n",
    "\n",
    "train_data.lazyPrint(1)\n",
    "\n",
    "\n",
    "def binary_eval(title):\n",
    "    \"\"\"Build a binary-classification evaluator that lazily prints its metrics under title.\"\"\"\n",
    "    return (\n",
    "        EvalBinaryClassBatchOp()\n",
    "        .setPositiveLabelValueString(\"1\")\n",
    "        .setLabelCol(LABEL_COL_NAME)\n",
    "        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "        .lazyPrintMetrics(title)\n",
    "    )\n",
    "\n",
    "\n",
    "# Linear SVM on the expanded vector column.\n",
    "(\n",
    "    LinearSvm()\n",
    "    .setVectorCol(VEC_COL_NAME)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .fit(train_data)\n",
    "    .transform(test_data)\n",
    "    .link(binary_eval(\"LinearSVM\"))\n",
    ")\n",
    "\n",
    "# Logistic regression with its default optimisation method.\n",
    "(\n",
    "    LogisticRegression()\n",
    "    .setVectorCol(VEC_COL_NAME)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .fit(train_data)\n",
    "    .transform(test_data)\n",
    "    .link(binary_eval(\"LogisticRegression\"))\n",
    ")\n",
    "\n",
    "# Logistic regression with the optimisation method set to Newton.\n",
    "(\n",
    "    LogisticRegression()\n",
    "    .setOptimMethod(\"Newton\")\n",
    "    .setVectorCol(VEC_COL_NAME)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .fit(train_data)\n",
    "    .transform(test_data)\n",
    "    .link(binary_eval(\"LogisticRegression + OptimMethod.Newton\"))\n",
    ")\n",
    "\n",
    "BatchOperator.execute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#c_7\n",
    "train_data = AkSourceBatchOp().setFilePath(DATA_DIR + TRAIN_FILE)\n",
    "test_data = AkSourceBatchOp().setFilePath(DATA_DIR + TEST_FILE)\n",
    "\n",
    "# FM classifier on the raw feature columns, with lazy printing of train and model info\n",
    "# enabled; its predictions on the test set are linked to a binary evaluator.\n",
    "(\n",
    "    FmClassifier()\n",
    "    .setNumEpochs(10)\n",
    "    .setLearnRate(0.5)\n",
    "    .setNumFactor(2)\n",
    "    .setFeatureCols(FEATURE_COL_NAMES)\n",
    "    .setLabelCol(LABEL_COL_NAME)\n",
    "    .setPredictionCol(PREDICTION_COL_NAME)\n",
    "    .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "    .enableLazyPrintTrainInfo()\n",
    "    .enableLazyPrintModelInfo()\n",
    "    .fit(train_data)\n",
    "    .transform(test_data)\n",
    "    .link(\n",
    "        EvalBinaryClassBatchOp()\n",
    "        .setPositiveLabelValueString(\"1\")\n",
    "        .setLabelCol(LABEL_COL_NAME)\n",
    "        .setPredictionDetailCol(PRED_DETAIL_COL_NAME)\n",
    "        .lazyPrintMetrics(\"FM\")\n",
    "    )\n",
    ")\n",
    "\n",
    "BatchOperator.execute()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
